import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
from scipy import stats
import sys
# Add the directory passed as the first command-line argument to the module search path
sys.path.append(sys.argv[1])

import pandas as pd  
import pickle
  
# Read the customer table from the directory given as the first CLI argument
dataset_path = os.path.join(sys.argv[1], 'credit_customers.csv')
credit_customers = pd.read_csv(dataset_path)

# Count the null entries in every column
missing_values = credit_customers.isna().sum()

print("Missing values per column:\n", missing_values)
# pickle.dump(missing_values,open("./ref_result/missing_values.pkl","wb"))


import pandas as pd  
import pickle
  
# Reuse the credit_customers DataFrame loaded above
  
# Collect every object-dtype column; all of them are treated as nominal here
categorical_columns = credit_customers.select_dtypes(include=['object']).columns
nominal_categorical_columns = list(categorical_columns)

# Report which columns are suggested for one-hot encoding
print("Suggested encoding methods:")
print("1. One-hot encoding for nominal categorical variables:")
for column_name in nominal_categorical_columns:
    print("   -", column_name)

# Expand each nominal column into indicator columns and preview the result
credit_customers_encoded = pd.get_dummies(
    credit_customers,
    columns=nominal_categorical_columns,
)
encoded_preview = credit_customers_encoded.head()
print("\nData after one-hot encoding:\n", encoded_preview)
# pickle.dump(credit_customers_encoded,open("./ref_result/credit_customers_encoded.pkl","wb"))


import pandas as pd  
import numpy as np
from sklearn.preprocessing import StandardScaler 
import pickle
  
# Reuse the credit_customers DataFrame loaded above
  
# Select the numeric columns of the dataset
numerical_columns = credit_customers.select_dtypes(include=['number']).columns

# Spread (max - min) of every numeric column
ranges = {
    column_name: credit_customers[column_name].max() - credit_customers[column_name].min()
    for column_name in numerical_columns
}

# Flag scaling as needed as soon as one column spans more than 1 unit.
# The flag is informational only: scaling below is applied unconditionally.
normalization_needed = any(spread > 1 for spread in ranges.values())

# Standardise the numeric columns on a copy so the raw data stays untouched
scaler = StandardScaler()
scaled_values = scaler.fit_transform(credit_customers[numerical_columns])
credit_customers_normalized = credit_customers.copy()
credit_customers_normalized[numerical_columns] = scaled_values

print("Data after Standard Scaling:\n", credit_customers_normalized.head())
# pickle.dump(credit_customers_normalized.head(),open("./ref_result/credit_customers_normalized_head.pkl","wb"))


import pandas as pd   
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt 
  
# Reuse the credit_customers DataFrame loaded above
  
# Extract the columns used for customer segmentation
important_columns = ['credit_history', 'age', 'employment', 'credit_amount', 'savings_status']
data_for_clustering = credit_customers[important_columns].copy()

# Label-encode 'savings_status' and 'employment' into integer codes
le_savings_status = LabelEncoder()
le_employment = LabelEncoder()

data_for_clustering['savings_status'] = le_savings_status.fit_transform(data_for_clustering['savings_status'])
data_for_clustering['employment'] = le_employment.fit_transform(data_for_clustering['employment'])

# One-hot encode 'credit_history', dropping the first level
data_for_clustering = pd.get_dummies(data_for_clustering, columns=['credit_history'], drop_first=True)

# Standardise every feature before clustering
scaler = StandardScaler()
data_for_clustering_scaled = scaler.fit_transform(data_for_clustering)

# Candidate cluster counts; starts at 2 because the silhouette score is not
# defined for a single cluster. Defined once so the loop and the plot agree.
cluster_counts = range(2, 15)

# Mean silhouette score for each candidate number of clusters
silhouette_scores = []
for cluster_num in cluster_counts:
    kmeans = KMeans(n_clusters=cluster_num, random_state=42)
    cluster_labels = kmeans.fit_predict(data_for_clustering_scaled)
    silhouette_avg = silhouette_score(data_for_clustering_scaled, cluster_labels)
    silhouette_scores.append(silhouette_avg)

# Plot silhouette scores against the number of clusters
plt.figure(figsize=(10, 6))
plt.plot(cluster_counts, silhouette_scores, marker='o', linestyle='--')
plt.title('Silhouette Scores for Different Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.grid(True)

# savefig does not create missing directories, so make sure the target exists
os.makedirs('ref_result', exist_ok=True)
plt.savefig('ref_result/Silhouette_Scores.png')
# plt.show()

import pandas as pd  
from sklearn.preprocessing import StandardScaler, LabelEncoder  
from sklearn.cluster import KMeans  
import pickle
  
# Reuse the credit_customers DataFrame loaded above
  
# Columns that drive the customer segmentation
important_columns = ['credit_history', 'age', 'employment', 'credit_amount', 'savings_status']
data_for_clustering = credit_customers[important_columns].copy()

# Label-encode the two categorical columns into integer codes
for encoded_col in ('savings_status', 'employment'):
    data_for_clustering[encoded_col] = LabelEncoder().fit_transform(data_for_clustering[encoded_col])

# One-hot encode 'credit_history', dropping the first level
data_for_clustering = pd.get_dummies(
    data_for_clustering,
    columns=['credit_history'],
    drop_first=True,
)

# Standardise every feature before clustering
data_for_clustering_scaled = StandardScaler().fit_transform(data_for_clustering)

# Fit K-means with the chosen parameters and label every customer
kmeans_params = {"n_clusters": 4, "random_state": 42}
kmeans = KMeans(**kmeans_params)
cluster_labels = kmeans.fit_predict(data_for_clustering_scaled)

# Store the cluster label next to the original (unencoded) customer data
credit_customers['cluster'] = cluster_labels

# Bundle the algorithm name, its parameters and the labelled segmentation columns
segment_columns = important_columns + ['cluster']
result = ("K-means", kmeans_params, credit_customers[segment_columns])

print("result:\n", result)
# pickle.dump(result,open("./ref_result/result.pkl","wb"))


import pandas as pd  
from sklearn.preprocessing import StandardScaler, LabelEncoder  
from sklearn.cluster import KMeans  
import pickle
  
# Reuse the credit_customers DataFrame loaded above
  
# Columns that drive the customer segmentation
important_columns = ['credit_history', 'age', 'employment', 'credit_amount', 'savings_status']
data_for_clustering = credit_customers[important_columns].copy()

# Label-encode the two categorical columns into integer codes
for encoded_col in ('savings_status', 'employment'):
    data_for_clustering[encoded_col] = LabelEncoder().fit_transform(data_for_clustering[encoded_col])

# One-hot encode 'credit_history', dropping the first level
data_for_clustering = pd.get_dummies(
    data_for_clustering,
    columns=['credit_history'],
    drop_first=True,
)

# Standardise every feature before clustering
data_for_clustering_scaled = StandardScaler().fit_transform(data_for_clustering)

# Cluster the customers into 4 groups
kmeans = KMeans(n_clusters=4, random_state=42)
cluster_labels = kmeans.fit_predict(data_for_clustering_scaled)

# Store the cluster label next to the original (unencoded) customer data
credit_customers['cluster'] = cluster_labels

# Row masks for the targeting criteria: paid credit history, age 25-45
# (both bounds inclusive) and at least 4 years of employment
good_credit_history = credit_customers['credit_history'].isin(['existing paid', 'all paid'])
age_group = credit_customers['age'].between(25, 45)
stable_employment = credit_customers['employment'].isin(['>=7', '4<=X<7'])

# Keep only the customers that satisfy every criterion
target_mask = good_credit_history & age_group & stable_employment
target_customers = credit_customers[target_mask]

# Distinct clusters that contain at least one target customer
distinct_clusters = target_customers['cluster'].unique()
target_customer_segments = distinct_clusters.tolist()

print("target_customer_segments:\n", target_customer_segments)
# pickle.dump(target_customer_segments,open("./ref_result/target_customer_segments.pkl","wb"))


import pandas as pd  
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder  
from sklearn.cluster import KMeans  
import pickle
  
# Reuse the credit_customers DataFrame loaded above
  
# Columns that drive the customer segmentation
important_columns = ['credit_history', 'age', 'employment', 'credit_amount', 'savings_status']
data_for_clustering = credit_customers[important_columns].copy()

# Label-encode 'savings_status' and 'employment' into integer codes
data_for_clustering['savings_status'] = LabelEncoder().fit_transform(data_for_clustering['savings_status'])
data_for_clustering['employment'] = LabelEncoder().fit_transform(data_for_clustering['employment'])

# One-hot encode 'credit_history', dropping the first level
data_for_clustering = pd.get_dummies(data_for_clustering, columns=['credit_history'], drop_first=True)

# Standardise every feature before clustering
data_for_clustering_scaled = StandardScaler().fit_transform(data_for_clustering)

# Cluster the customers into 4 groups
kmeans = KMeans(n_clusters=4, random_state=42)
cluster_labels = kmeans.fit_predict(data_for_clustering_scaled)

# Store the cluster label next to the original (unencoded) customer data
credit_customers['cluster'] = cluster_labels

# Candidate customers for promotions and financing offers: credit history
# 'no credits/all paid' and a credit amount above the 75th percentile
good_credit_history = credit_customers['credit_history'].isin(['no credits/all paid'])
high_credit_amount = credit_customers['credit_amount'] > credit_customers['credit_amount'].quantile(0.75)

# Keep only the customers that satisfy both criteria
potential_customers = credit_customers[good_credit_history & high_credit_amount]

# Distinct clusters that contain at least one candidate customer
additional_customer_segments = potential_customers['cluster'].unique().tolist()

# Exclude the segments already selected by the previous step. The list computed
# there (target_customer_segments) is used directly instead of a hard-coded
# placeholder. NOTE(review): this step re-fits K-means on the same features with
# the same random_state, so the labels are expected to match the previous step.
already_targeted = set(target_customer_segments)
additional_customer_segments = [
    segment for segment in additional_customer_segments if segment not in already_targeted
]

print("additional_customer_segments:\n", additional_customer_segments)
# pickle.dump(additional_customer_segments,open("./ref_result/additional_customer_segments.pkl","wb"))


